home *** CD-ROM | disk | FTP | other *** search
Text File | 1996-07-06 | 14.0 KB | 469 lines | [TEXT/CWIE] |
- // ===========================================================================
- // DMultiStringLocator.cp
- // ---------------------
- // ©1996 Eric Gundrum, All rights reserved.
- //
- // The contents of this file may be freely altered and freely distributed
- // in any form, provided this copyright statement is retained unaltered.
- // Add your own changes below.
- // ---------------------
- //
- // Based on source code provided in Practical Algorithms for Programmers,
- // by Binstock and Rex, published in 1995 by Addison Wesley.
- //
- // Simultaneously search text for multiple strings using the Aho/Corasick
- // algorithm.
- //
-
- // Mac Headers
- #include <Types.h> // for NULL
-
- // PowerPlant Headers
- #include <LListIterator.h>
- #include <UDebugging.h>
-
- // Library Headers
- #include "DMultiStringLocator.h"
-
-
- // ===========================================================================
- // ----------- public member function implementations ----------
- // ===========================================================================
- #pragma mark --- public members ---
- // ---------------------------------------------------------------------------
- // Constructor:
- // Create state tables used by the search.
- //
- // inStringList is a list of pointers to DSearchStrings.
- // Ownership of the contents of inStringList is NOT transferred. The contents of
- // inStringList must exist as long as this instance of DMultiStringLocator exists.
-
- DMultiStringLocator::DMultiStringLocator
- ( LList &inStringList // list of search strings
- , int inAlphabetSize // size of BranchTable
- )
- : mAlphabetSize( inAlphabetSize )
- {
- int theChar;
- stateT theState;
- DSearchString *stringP = NULL;
-
- if ( NULL == &inStringList ) throw ( input_item_NULL() ); // validate input
-
- // compute maximum number of possible states by adding lengths of all strings
- mMaxState = 1;
- LListIterator stringScan ( inStringList, iterate_FromStart );
- while ( stringScan.Next ( &stringP ) )
- {
- mMaxState += stringP->Length();
- }
-
- // alloc arrays
- MatchArray = new int[mMaxState]; // list of matching chars
- GotoArray = new GotoTable[mMaxState]; // list of next states or branch ptr
- // GotoArray is indexed by MatchArray contents
- RetryArray = new stateT[mMaxState]; // list of retry states
- OutArray = new LList[mMaxState]; // list of pointers to recognizable words
-
- // initialize arrays
- for ( theState = state_begin; theState < mMaxState; ++theState )
- {
- MatchArray[theState] = EMPTY_SLOT;
- }
-
- // initialize state_array[state_begin]
- mHighState = state_begin;
- AddStateTrans ( 'a', state_begin, FAIL_STATE );
- AddStateTrans ( 'b', state_begin, FAIL_STATE ); // force a BranchTable
-
- // add each string to the state engine
- stringScan.ResetTo ( iterate_FromStart );
- while ( true == stringScan.Next ( &stringP ) )
- {
- if ( NULL != stringP ) AddString ( *stringP );
- }
-
- // setup return to zero transitions for state[state_begin]
- for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
- {
- if ( FAIL_STATE == GotoArray[state_begin].BranchTable[theChar] )
- {
- GotoArray[state_begin].BranchTable[theChar] = state_begin;
- }
- }
-
- // compute failure array
- RetryArrayInit();
- }
-
-
- // ---------------------------------------------------------------------------
- // Search a ram-based buffer.
- //
- // Returns an int indicating the maximum number of characters that could be
- // part of a match at the end of the buffer. This indicates what number of
- // bytes should be copied to the next buffer if there is another buffer to
- // search. This does not account for found strings embedded in the partial
- // match string. Such embedded strings will be found twice. Note that this
- // problem occurs only when a complete search is split over multiple calls.
- //
- // (I'm not yet sure how best to correct this design flaw. Currently I ignore
- // it, but one could work around it by checking the buffer location of each
- // match to see if it is a duplicate.)
-
- int
- DMultiStringLocator::SearchBuffer ( Uchar *inBufferP, int inSize )
- {
- unsigned char theChar; // comparison character
- stateT currentState, nextState;
- int matchChar, charLength = 0;
- Uchar *theBufferP = inBufferP;
- Boolean keepGoing = true;
-
- // validate inputs
- ThrowIfNil_( inBufferP );
-
- currentState = state_begin;
- while ( keepGoing && ( theBufferP < inBufferP + inSize )) // for each character...
- {
- theChar = *theBufferP++;
- charLength += 1; // tracking for partial match at end of buffer
- for ( ;; ) // determine the next state
- {
- if ( ( state_begin == currentState ) ) // branch shortcut
- {
- charLength = 1; // assumes new strings always start at state_begin
- nextState = GotoArray[currentState].BranchTable[theChar];
- }
- else if ( BRANCH == ( matchChar = MatchArray[currentState] )) // branch
- {
- nextState = GotoArray[currentState].BranchTable[theChar];
- }
- else if ( matchChar == theChar ) // single char
- {
- nextState = GotoArray[currentState].GotoState;
- }
- else // empty slot
- {
- nextState = FAIL_STATE;
- }
-
- if ( FAIL_STATE != nextState ) break;
-
- currentState = RetryArray[currentState]; // goto another word containing from char
- Assert_( currentState < mMaxState );
- }
- currentState = nextState;
- Assert_( currentState < mMaxState );
-
- // scan OutArray to see if a complete word is found
- if ( OutArray[currentState].GetCount() > 0 ) // list not empty
- {
- // report for each string of the recognized string list
- DSearchString *reportItemP = NULL;
- LListIterator foundList ( OutArray[currentState], iterate_FromStart );
- while ( foundList.Next ( &reportItemP ) ) // for each output string...
- {
- if ( NULL == reportItemP ) throw ( list_item_NULL() );
- keepGoing = reportItemP->ReportFound( (theBufferP-inBufferP) );
- }
- }
- }
- // Pass along the request to cancel the search.
- if ( !keepGoing ) charLength = searchStatus_stop;
-
- // Return indication of what part of the buffer should be reused.
- return charLength;
- }
-
-
- // ---------------------------------------------------------------------------
- // Destructor
-
- DMultiStringLocator::~DMultiStringLocator()
- {
- stateT theState;
-
- // search and destroy BranchTables
- for ( theState = state_begin; theState < mMaxState; ++theState )
- {
- if ( BRANCH == MatchArray[theState] )
- {
- delete[] ( GotoArray[theState].BranchTable );
- }
- }
-
- // destroy other arrays
- delete[] ( MatchArray );
- delete[] ( GotoArray );
- delete[] ( RetryArray );
- delete[] ( OutArray ); // listed objects are just pointers
- }
-
-
- // ===========================================================================
- // ----------- private member function implementations ----------
- // ===========================================================================
- #pragma mark --- private members ---
- // ---------------------------------------------------------------------------
- // Add inString to list of recognizable strings
-
- void
- DMultiStringLocator::AddString( DSearchString &inString )
- {
- stateT currentState, nextState;
- Uint8 charPos;
-
- currentState = state_begin;
-
- if ( NULL == &inString ) throw ( input_item_NULL() ); // validate input
-
- // attempt to place new word on an old word
- for ( charPos = 1; charPos <= inString.Length(); ++charPos ) // for each char...
- {
- if ( inString[charPos] == MatchArray[currentState] ) // single char slot
- {
- currentState = GotoArray[currentState].GotoState;
- Assert_( currentState < mMaxState );
- }
- else if ( BRANCH == MatchArray[currentState] ) // branch
- {
- if ( FAIL_STATE ==
- ( nextState = GotoArray[currentState].BranchTable[inString[charPos]] ))
- {
- break; // char not part of a stored word
- }
- else // a transition for this character
- {
- currentState = nextState;
- }
- Assert_( currentState < mMaxState );
- }
- else // empty slot
- {
- break; // no match for this character
- }
- }
-
- // add new states as needed
- for ( ; charPos <= inString.Length(); ++charPos ) // for each char...
- {
- mHighState += 1;
- if ( mHighState >= mMaxState ) throw ( state_table_exceeded() );
-
- AddStateTrans ( inString[charPos], currentState, mHighState );
- currentState = mHighState;
- }
-
- // Place a copy of the pointer to the string in the output list
- if ( 0 < inString.Length() )
- {
- OutArray[currentState].InsertItemsAt( 1, arrayIndex_Last, &(&inString) );
- }
- }
-
-
- // ---------------------------------------------------------------------------
- // Add transition for matchChar from currentState to nextState
-
- void
- DMultiStringLocator::AddStateTrans ( int matchChar, stateT currentState, stateT nextState )
- {
- int theChar;
- stateT *newBranchTable;
-
- // validate inputs
- Assert_( currentState < mMaxState );
- Assert_( nextState < mMaxState );
-
- if ( EMPTY_SLOT == MatchArray[currentState] ) // empty slot
- {
- MatchArray[currentState] = matchChar;
- GotoArray[currentState].GotoState = nextState;
- }
- else if ( BRANCH == MatchArray[currentState] ) // branch
- {
- GotoArray[currentState].BranchTable[matchChar] = nextState;
- }
- else // single char
- {
- newBranchTable = new stateT[mAlphabetSize];
- for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
- {
- newBranchTable[theChar] = FAIL_STATE;
- }
-
- // copy data from single-way branch to char position of newBranchTable
- newBranchTable [ MatchArray [ currentState ] ]
- = GotoArray[ currentState ].GotoState;
-
- // copy new data
- newBranchTable[matchChar] = nextState;
-
- // load state_array
- MatchArray[currentState] = BRANCH;
- GotoArray[currentState].BranchTable = newBranchTable;
- }
- }
-
-
- // ---------------------------------------------------------------------------
- // build RetryArray and update GotoArray
-
- void
- DMultiStringLocator::RetryArrayInit( void )
- {
- stateT *fifo, fifoTop;
- stateT nextState, currentState;
- int theChar;
-
- // create fifo
- fifo = new stateT[mMaxState]; // a FIFO list of possible states
- if ( NULL == fifo ) throw ( memerror() );
- fifoTop = 0;
- fifo[fifoTop] = 0;
-
- // initialize RetryArray with first level BranchTable states
- for ( theChar = 0; theChar < mAlphabetSize; ++theChar )
- {
- if ( state_begin !=
- ( currentState = GotoArray[state_begin].BranchTable[theChar] ))
- {
- Assert_( currentState < mMaxState );
- RetryArray[currentState] = state_begin;
- QueueAdd (fifo, fifoTop, currentState );
- }
- }
-
- // update RetryArray with states from scan of lower levels
- while ( 0 != fifo[fifoTop] ) // for each state in the fifo...
- {
- // grab state from front of fifo and advance fifoTop to next item
- nextState = fifo[fifoTop];
- fifoTop = nextState; // effectively drops retrieved item from the fifo
- Assert_( nextState < mMaxState );
-
- // examine this state
- if ( EMPTY_SLOT == MatchArray[nextState] )
- {
- continue;
- }
- else if ( BRANCH == MatchArray[nextState] )
- {
- // scan subsidiary states
- for ( theChar = 0; theChar < mAlphabetSize; ++theChar ) // scan branch table
- {
- if ( FAIL_STATE !=
- ( currentState = GotoArray[nextState].BranchTable[theChar] ))
- {
- Assert_( currentState < mMaxState );
- // add new state to the fifo
- QueueAdd ( fifo, fifoTop, currentState );
- FindRetryState ( theChar, RetryArray[nextState], currentState );
- }
- }
- }
- else // single character
- {
- QueueAdd ( fifo, fifoTop, GotoArray[nextState].GotoState );
- FindRetryState ( MatchArray[nextState]
- , RetryArray[nextState], GotoArray[nextState].GotoState );
- }
- }
-
- delete[] ( fifo );
- }
-
-
- // ---------------------------------------------------------------------------
- // add inItem to end of fifo (a FIFO list)
- // fifo is actually a list implemented in an array. Each item points to the
- // next intended item. Items are not necessarily added in order (maybe?).
-
- void
- DMultiStringLocator::QueueAdd( int *fifo, int fifoTop, stateT inState )
- {
- const stateT fifo_end = 0;
- stateT nextState;
-
- // validate inputs
- Assert_( inState < mMaxState );
- if ( NULL == fifo ) throw ( input_item_NULL() );
-
- nextState = fifo[fifoTop];
- if ( fifo_end == nextState ) // list is empty
- {
- fifo[fifoTop] = inState;
- }
- else
- {
- for ( ; fifo_end != fifo[nextState]; nextState = fifo[nextState] )
- {
- ; // walk to the next open slot
- Assert_( nextState < mMaxState ); // don't walk off the end of the list
- }
- fifo[nextState] = inState; // insert state
- }
- fifo[inState] = fifo_end; // terminate list
- }
-
-
- // ---------------------------------------------------------------------------
- // Compute failure transition.
- //
- // inChar would normally cause a change from currentState to nextState. To
- // compute the retry value, look back in search of other places inChar might go.
- //
- // This appears to assume inChar is either a single char match or a branch.
-
- void
- DMultiStringLocator::FindRetryState
- ( const int inChar
- , stateT currentState
- , const stateT nextState
- )
- {
- stateT on_fail;
-
- // validate inputs
- Assert_( currentState < mMaxState );
- Assert_( nextState < mMaxState );
-
- // locate embedded words by following RetryArray until it does not link to FAIL_STATE
- for ( ; ; currentState = RetryArray[currentState] )
- {
- Assert_( currentState < mMaxState );
- if ( inChar == MatchArray[currentState] ) // desired single char
- {
- if ( FAIL_STATE != ( on_fail = GotoArray[currentState].GotoState )) break;
- }
- else if ( BRANCH == MatchArray[currentState] ) // branch
- // 960630: Original code (below) fails when searching for { nil, int, Item }.
- // else if ( EMPTY_SLOT != MatchArray[currentState] )
- // Why was this not an explicit BRANCH test?
- {
- if ( FAIL_STATE !=
- ( on_fail = GotoArray[currentState].BranchTable[inChar] )) break;
- }
- }
-
- Assert_( on_fail < mMaxState );
- RetryArray[nextState] = on_fail;
-
- // --- merge output lists
- // duplicate each item of OutArray[on_fail] to end of list OutArray[nextState]
- // ••• When can there every really be two words with the same end state?
- if ( OutArray[on_fail].GetCount() > 0 ) // list not empty
- {
- void *itemP = NULL;
- LListIterator failList ( OutArray[on_fail], iterate_FromStart );
- while ( failList.Next ( &itemP ) ) // for each on_fail output string...
- {
- OutArray[nextState].InsertItemsAt( 1, arrayIndex_Last, &itemP );
- }
- }
- }
-
-
- // ===========================================================================
-